pacman::p_load(sf, sfdep, tmap, tidyverse, RColorBrewer, ggplot2, spatstat, jsonlite, units, matrixStats, httr)Take Home Exercise 3
1. Overview
1.1 Introduction
1.2 My Responsibilities
- Data Preparation, Preprocessing
- Exploratory Data Analysis (EDA)
1.3 Importing Packages
Here, we have loaded the following packages:
2. The Data
For this project, we will be using the following data sets.
Singapore Rental Flat Prices (Jan-17 to Sep-24) from data.gov.sg
Master Plan 2014 Subzone Boundary (Web) from data.gov.sg
Hawker Centres Dataset from data.gov.sg
Kindergarten, Childcare, Primary School Datasets from OneMap API
Bus Stops Location, MRT/ LRT Locations from LTA Data Mall
Shopping Malls Coordinates through wikipedia and webscraping with the coordinates retrieved through OneMap API
2.1 Importing Geospatial Data
2.1.1 Importing Singapore Subzone Boundaries
The code chunk below is used to import the MP14_SUBZONE_WEB_PL shapefile by using st_read() of the sf package.
mpsz_sf <- st_read(dsn = "data/geospatial/MasterPlan2014SubzoneBoundaryWebSHP/", layer = "MP14_SUBZONE_WEB_PL")Reading layer `MP14_SUBZONE_WEB_PL' from data source
`/Users/georgiaxng/georgiaxng/is415-handson/Take-home_Ex/Take-home_Ex03/data/geospatial/MasterPlan2014SubzoneBoundaryWebSHP'
using driver `ESRI Shapefile'
Simple feature collection with 323 features and 15 fields
Geometry type: MULTIPOLYGON
Dimension: XY
Bounding box: xmin: 2667.538 ymin: 15748.72 xmax: 56396.44 ymax: 50256.33
Projected CRS: SVY21
write_rds(mpsz_sf, 'data/rds/mpsz_sf.rds')Using st_crs, we can check the coordinate system.
st_crs(mpsz_sf)Coordinate Reference System:
User input: SVY21
wkt:
PROJCRS["SVY21",
BASEGEOGCRS["SVY21[WGS84]",
DATUM["World Geodetic System 1984",
ELLIPSOID["WGS 84",6378137,298.257223563,
LENGTHUNIT["metre",1]],
ID["EPSG",6326]],
PRIMEM["Greenwich",0,
ANGLEUNIT["Degree",0.0174532925199433]]],
CONVERSION["unnamed",
METHOD["Transverse Mercator",
ID["EPSG",9807]],
PARAMETER["Latitude of natural origin",1.36666666666667,
ANGLEUNIT["Degree",0.0174532925199433],
ID["EPSG",8801]],
PARAMETER["Longitude of natural origin",103.833333333333,
ANGLEUNIT["Degree",0.0174532925199433],
ID["EPSG",8802]],
PARAMETER["Scale factor at natural origin",1,
SCALEUNIT["unity",1],
ID["EPSG",8805]],
PARAMETER["False easting",28001.642,
LENGTHUNIT["metre",1],
ID["EPSG",8806]],
PARAMETER["False northing",38744.572,
LENGTHUNIT["metre",1],
ID["EPSG",8807]]],
CS[Cartesian,2],
AXIS["(E)",east,
ORDER[1],
LENGTHUNIT["metre",1,
ID["EPSG",9001]]],
AXIS["(N)",north,
ORDER[2],
LENGTHUNIT["metre",1,
ID["EPSG",9001]]]]
2.1.1.1 Checking Validity of Geometries
Using st_is_valid, we can check to see whether all the polygons are valid or not. From the results, we can see a total of 9 not valid.
# checks for the number of geometries that are invalid
length(which(st_is_valid(mpsz_sf) == FALSE))[1] 9
To rectify this, we can use st_make_valid() to correct these invalid geometries as demonstrated in the code chunk below.
mpsz_sf <- st_make_valid(mpsz_sf)
length(which(st_is_valid(mpsz_sf) == FALSE))[1] 0
2.1.1.2 Grouping By Town
Since our data is categorised by towns, it will be useful for us to have this done so as well.
# Combine geometries by town in mpsz_sf
combined_town_mpsz_sf <- mpsz_sf %>%
group_by(PLN_AREA_N) %>% # Group by town identifier
summarize(geometry = st_union(geometry), .groups = 'drop')2.1.2 Importing Kindergartens
This chunk of code imports the kindergartens data.
kindergarten_json <- fromJSON("data/geospatial/kindergartens.json")
# Drop the first SrchResults row, which the original code also discarded
# (presumably API metadata rather than a kindergarten record — TODO confirm).
kindergarten_cleaned <- kindergarten_json$SrchResults[-1, ]
# Split each "lat,lng" string ONCE; the original called strsplit twice per
# row through two separate sapply() passes. vapply() also pins the return
# type to a numeric scalar per element, unlike sapply().
kindergarten_latlng <- strsplit(kindergarten_cleaned$LatLng, ",")
kindergarten_df <- data.frame(
  NAME = kindergarten_cleaned$NAME,
  latitude = vapply(kindergarten_latlng, function(p) as.numeric(p[1]), numeric(1)),
  longitude = vapply(kindergarten_latlng, function(p) as.numeric(p[2]), numeric(1))
)
# Convert to sf points (WGS84 lon/lat) and project to SVY21 (EPSG:3414).
kindergarten_sf <- kindergarten_df %>%
  st_as_sf(coords = c("longitude", "latitude"), crs = 4326) %>%
  st_transform(crs = 3414)
2.1.3 Importing Childcare
This chunk of code imports the childcare data.
childcare_json <- fromJSON("data/geospatial/childcare.json")
# Drop the first SrchResults row, matching the kindergarten import above
# (presumably API metadata rather than a childcare record — TODO confirm).
childcare_cleaned <- childcare_json$SrchResults[-1, ]
# Split each "lat,lng" string ONCE; the original called strsplit twice per
# row through two separate sapply() passes. vapply() also pins the return
# type to a numeric scalar per element, unlike sapply().
childcare_latlng <- strsplit(childcare_cleaned$LatLng, ",")
childcare_df <- data.frame(
  NAME = childcare_cleaned$NAME,
  latitude = vapply(childcare_latlng, function(p) as.numeric(p[1]), numeric(1)),
  longitude = vapply(childcare_latlng, function(p) as.numeric(p[2]), numeric(1))
)
# Convert to sf points (WGS84 lon/lat) and project to SVY21 (EPSG:3414).
childcare_sf <- childcare_df %>%
  st_as_sf(coords = c("longitude", "latitude"), crs = 4326) %>%
  st_transform(crs = 3414)
2.1.4 Importing Hawker Centre
Similarly here, we will use st_read to read the geojson information; however, since the column values are in the format of HTML code of ’
’ etc we will need to use a function to apply a regex expression in order to extract the name of the hawker centres.
hawker_sf <- st_read('data/geospatial/HawkerCentresGEOJSON.geojson')Reading layer `HawkerCentresGEOJSON' from data source
`/Users/georgiaxng/georgiaxng/is415-handson/Take-home_Ex/Take-home_Ex03/data/geospatial/HawkerCentresGEOJSON.geojson'
using driver `GeoJSON'
Simple feature collection with 125 features and 2 fields
Geometry type: POINT
Dimension: XYZ
Bounding box: xmin: 103.6974 ymin: 1.272716 xmax: 103.9882 ymax: 1.449017
z_range: zmin: 0 zmax: 0
Geodetic CRS: WGS 84
# Pull the hawker centre NAME out of the HTML table embedded in the
# Description field. Returns NA when the input is NA or when the
# <th>NAME</th> <td>...</td> pattern is not present.
extract_name <- function(description) {
  # Guard clause: missing descriptions yield NA immediately.
  if (is.na(description)) {
    return(NA)
  }
  name_pattern <- ".*<th>NAME</th> <td>(.*?)</td>.*"
  extracted <- sub(name_pattern, "\\1", description)
  # sub() hands back the input unchanged when the pattern does not match,
  # so an unmodified string means "no NAME cell found".
  if (identical(extracted, description)) {
    return(NA)
  }
  extracted
}
# Apply the extraction function to every row
hawker_sf <- hawker_sf %>%
mutate(Name = sapply(Description, extract_name)) %>% select (-Description)Here, we can see that the hawker centres are now appropriately named.
Simple feature collection with 6 features and 1 field
Geometry type: POINT
Dimension: XYZ
Bounding box: xmin: 103.7798 ymin: 1.284425 xmax: 103.9048 ymax: 1.449017
z_range: zmin: 0 zmax: 0
Geodetic CRS: WGS 84
Name geometry
1 Market Street Hawker Centre POINT Z (103.8502 1.284425 0)
2 Marsiling Mall Hawker Centre POINT Z (103.7798 1.433539 0)
3 Margaret Drive Hawker Centre POINT Z (103.8047 1.297486 0)
4 Fernvale Hawker Centre & Market POINT Z (103.8771 1.391592 0)
5 One Punggol Hawker Centre POINT Z (103.9048 1.40819 0)
6 Bukit Canberra Hawker Centre POINT Z (103.8225 1.449017 0)
As shown above, we can see that the geographic coordinate system for the hawker dataset is in WGS84 and has XYZ coordinates, among which contains the Z-coordinates we do not need. Thus, we can use st_zm() to remove the Z-coordinate and project it to the SVY21 coordinate system using st_transform().
hawker_sf <- st_zm(hawker_sf) %>%
st_transform(crs = 3414)
head(hawker_sf)Simple feature collection with 6 features and 1 field
Geometry type: POINT
Dimension: XY
Bounding box: xmin: 22042.51 ymin: 29650.7 xmax: 35955.52 ymax: 47850.43
Projected CRS: SVY21 / Singapore TM
Name geometry
1 Market Street Hawker Centre POINT (29874.82 29650.7)
2 Marsiling Mall Hawker Centre POINT (22042.51 46139.03)
3 Margaret Drive Hawker Centre POINT (24816.7 31094.91)
4 Fernvale Hawker Centre & Market POINT (32867.9 41500.77)
5 One Punggol Hawker Centre POINT (35955.52 43336.13)
6 Bukit Canberra Hawker Centre POINT (26794.39 47850.43)
2.1.5 Importing Bus Stops
Here we are importing the bus stop locations using st_read and also converting it to the SVY21 coordinate system.
busstop_sf <- st_read(dsn = "data/geospatial/BusStopLocation_Jul2024/", layer = "BusStop")%>%
st_transform(crs = 3414)Reading layer `BusStop' from data source
`/Users/georgiaxng/georgiaxng/is415-handson/Take-home_Ex/Take-home_Ex03/data/geospatial/BusStopLocation_Jul2024'
using driver `ESRI Shapefile'
Simple feature collection with 5166 features and 3 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: 3970.122 ymin: 26482.1 xmax: 48285.52 ymax: 52983.82
Projected CRS: SVY21
2.1.6 Importing Shopping Malls
Here we are importing the shopping mall locations using read_csv and also converting it to the SVY21 coordinate system.
shoppingmall_sf <- read_csv('data/geospatial/shopping_mall_coordinates.csv') %>%
st_as_sf(coords = c("LONGITUDE", "LATITUDE"), crs=4326) %>%
st_transform(crs = 3414)2.1.7 Importing MRT
Here we are importing the mrt locations using st_read.
mrt_sf <- st_read(dsn = "data/geospatial/TrainStation_Jul2024/", layer = "RapidTransitSystemStation")Reading layer `RapidTransitSystemStation' from data source
`/Users/georgiaxng/georgiaxng/is415-handson/Take-home_Ex/Take-home_Ex03/data/geospatial/TrainStation_Jul2024'
using driver `ESRI Shapefile'
Simple feature collection with 230 features and 5 fields
Geometry type: POLYGON
Dimension: XY
Bounding box: xmin: 6068.209 ymin: 27478.44 xmax: 45377.5 ymax: 47913.58
Projected CRS: SVY21
Having imported the dataset, we will now need to check for both invalid geometries and NA values before proceeding. The chunk of code below detects not only these but also resolves it. The final printed result shows that all geometries are now valid.
# Check each geometry's validity; st_is_valid(reason = TRUE) returns NA for
# geometries so corrupt that validity cannot even be determined.
validity_checks <- st_is_valid(mrt_sf, reason = TRUE)
# Indices of rows whose validity check came back NA.
na_indices <- which(is.na(validity_checks))
# Drop those rows — but ONLY when there is something to drop. The original
# unconditional `mrt_sf[-na_indices, ]` is a classic R footgun: when
# na_indices is integer(0), negative indexing selects ZERO rows and silently
# wipes out the entire dataset.
if (length(na_indices) > 0) {
  mrt_sf <- mrt_sf[-na_indices, ]
}
# Verify the mrt object no longer contains invalid geometries
any(is.na(sf::st_is_valid(mrt_sf)))[1] FALSE
Here we use st_transform() to convert it to the SVY21 Coordinates System of CRS code 3414.
mrt_sf <- mrt_sf %>%
st_transform(crs = 3414)2.1.8 Importing Primary School
This chunk of code imports the primary school dataset from data.gov.sg and uses the select() function to select the relevant columns through the input of the column numbers.
primarysch_df = read_csv('data/geospatial/Generalinformationofschools.csv') %>% filter(mainlevel_code =='PRIMARY') %>% select(1,3,4)2.1.8.1 Geocoding Primary School Data using OneMap API
Since this dataset only has the addresses and not the actual coordinates, we will need to use the OneMapAPI to geocode these addresses. This chunk of code contains a function whereby the OneMapApi is called upon and returns the actual latitude and longitude of the addresses inputted.
Click to view the code
geocode <- function(address, postal) {
  # Geocode one address via the OneMap common/elastic/search API.
  #
  # address: search string passed as `searchVal`.
  # postal:  postal code used to narrow the search.
  # Returns a data frame with columns results.LATITUDE / results.LONGITUDE,
  # or a single all-NA row when OneMap finds no match for the address.
  base_url <- "https://www.onemap.gov.sg/api/common/elastic/search"
  query <- list("searchVal" = address,
                "postal" = postal,
                "returnGeom" = "Y",
                "getAddrDetails" = "N",
                "pageNum" = "1")
  res <- GET(base_url, query = query)
  # Name the encoding explicitly so content() does not emit a guess warning.
  restext <- content(res, as = "text", encoding = "UTF-8")
  parsed <- fromJSON(restext)
  # Guard the no-result case: without this, select() errors on the missing
  # columns and aborts the entire geocoding loop at the first bad address.
  if (is.null(parsed$results) || length(parsed$results) == 0) {
    return(data.frame(results.LATITUDE = NA, results.LONGITUDE = NA))
  }
  output <- parsed %>%
    as.data.frame %>%
    select(results.LATITUDE, results.LONGITUDE)
  return(output)
}
This chunk of code creates two columns for latitude and longitude and sets the default values to 0. Then it loops through every single row of the primary school dataset and calls upon the above function to populate the respective latitude and longitude values for each row.
Click to view the code
# Initialise the coordinate columns, then geocode every school row by row.
primarysch_df$LATITUDE <- 0
primarysch_df$LONGITUDE <- 0
# seq_len() is safe when the data frame has zero rows; 1:nrow() would
# iterate over c(1, 0) in that case.
for (i in seq_len(nrow(primarysch_df))){
  # Column 2 = address, column 3 = postal code (selected earlier).
  temp_output <- geocode(primarysch_df[i, 2], primarysch_df[i, 3])
  print(i)  # progress indicator for this long-running API loop
  primarysch_df$LATITUDE[i] <- temp_output$results.LATITUDE
  primarysch_df$LONGITUDE[i] <- temp_output$results.LONGITUDE
}
write_rds(primarysch_df, 'data/rds/geocoded_primarysch.rds')As shown below, using glimpse() we can see that the new columns for lat and long have been added with the values fetched using the OneMap API.
glimpse(primarysch_df)Rows: 178
Columns: 3
$ school_name <chr> "ADMIRALTY PRIMARY SCHOOL", "AHMAD IBRAHIM PRIMARY SCHOOL"…
$ address <chr> "11 WOODLANDS CIRCLE", "10 YISHUN STREET 11", "100 Br…
$ postal_code <dbl> 738907, 768643, 579646, 159016, 544969, 569785, 569920, 22…
Using read_rds, we can access the already processed and geocoded data from rds without needing to run through the geocoding function again. Since the data is in the WGS coordinate system, we can use st_transform() to project it to the SVY21 coordinate system we will be using.
primarysch_df <- read_rds('data/rds/geocoded_primarysch.rds')
primarysch_sf <- primarysch_df %>%
st_as_sf(coords = c("LONGITUDE", "LATITUDE"), crs=4326) %>%
st_transform(crs = 3414)2.1.9 Inferring CBD
Finally, let us factor in the proximity to the Central Business District - in the Downtown Core. For this, let us take the coordinates of Downtown Core to be the coordinates of the CBD:
lat <- 1.287953
lng <- 103.851784
cbd_sf <- data.frame(lat, lng) %>%
st_as_sf(coords = c("lng", "lat"), crs=4326) %>%
st_transform(crs=3414)2.2 Importing Aspatial Data
2.2.1 Importing Rental Flat
The code chunk below is used to import the rental data from data.gov.sg.
rental_df = read_csv('data/aspatial/RentingOutofFlats2024CSV.csv')To get a brief overview of existing columns of this dataset, we can use colnames() to do so.
colnames(rental_df)[1] "rent_approval_date" "town" "block"
[4] "street_name" "flat_type" "monthly_rent"
2.2.1.1 Converting rent_approval_date to a Valid Date Format
Since the rent_approval_date is in the chr format, we will want to convert it to the date format so that we can later better access and use this variable. This is done so by the ym() as shown in the chunk of code below.
rental_df$rent_approval_date <- ym(rental_df$rent_approval_date)2.2.1.2 Filtering For 2024
Since the dataset is rather large, we want to size down our scope and instead focus on only the 2024 data, which in this case is from Jan 2024 to Sep 2024.
rental_df <- rental_df %>%
filter(year(rent_approval_date) == 2024)2.2.1.3 Geocoding Rental Flat Data Using OneMap API
Like the primary school data, we face the similar problem here thus we will need to go through the geocoding process similarly to what we have done above. The geocoding function:
Click to view the code
geocode <- function(block, streetname) {
  # Geocode one rental flat via the OneMap common/elastic/search API.
  #
  # block:      HDB block number.
  # streetname: street name; concatenated with block to form the searchVal.
  # Returns a data frame with columns results.LATITUDE / results.LONGITUDE,
  # or a single all-NA row when OneMap finds no match for the address.
  base_url <- "https://www.onemap.gov.sg/api/common/elastic/search"
  address <- paste(block, streetname, sep = " ")
  query <- list("searchVal" = address,
                "returnGeom" = "Y",
                "getAddrDetails" = "N",
                "pageNum" = "1")
  res <- GET(base_url, query = query)
  # Name the encoding explicitly so content() does not emit a guess warning.
  restext <- content(res, as = "text", encoding = "UTF-8")
  parsed <- fromJSON(restext)
  # Guard the no-result case: without this, select() errors on the missing
  # columns and aborts the entire geocoding loop at the first bad address.
  if (is.null(parsed$results) || length(parsed$results) == 0) {
    return(data.frame(results.LATITUDE = NA, results.LONGITUDE = NA))
  }
  output <- parsed %>%
    as.data.frame %>%
    select(results.LATITUDE, results.LONGITUDE)
  return(output)
}
This chunk of code then calls upon the above function for every single row of the rental_df and writes it to the rds.
# Initialise the coordinate columns, then geocode every rental row by row.
rental_df$LATITUDE <- 0
rental_df$LONGITUDE <- 0
# seq_len() is safe when the data frame has zero rows; 1:nrow() would
# iterate over c(1, 0) in that case.
for (i in seq_len(nrow(rental_df))){
  # Column 3 = block, column 4 = street_name.
  temp_output <- geocode(rental_df[i, 3], rental_df[i, 4])
  print(i)  # progress indicator for this long-running API loop
  rental_df$LATITUDE[i] <- temp_output$results.LATITUDE
  rental_df$LONGITUDE[i] <- temp_output$results.LONGITUDE
}
write_rds(rental_df, 'data/rds/geocoded_rental_2024.rds')Without needing to run the above time-consuming method yet again, we can just read the data from the rds here.
rental_df <- read_rds('data/rds/geocoded_rental_2024.rds')2.2.1.4 CRS Adjustments
Another important step after importing the dataset is checking the coordinate system used, as seen in the result below using st_crs(), we can see that there is no CRS stated for rental_df.
st_crs(rental_df)Coordinate Reference System: NA
Therefore, we need to convert the longitude and latitude columns into a spatial format. Since our dataset is based in Singapore and it uses the SVY21 coordinate reference system (CRS Code: 3414), we will use the st_transform() function to perform the conversion and create the geometry column.
rental_sf <- rental_df %>%
st_as_sf(coords = c("LONGITUDE", "LATITUDE"), crs=4326) %>%
st_transform(crs = 3414)Using st_crs(), we can check and verify that the conversion is successful.
st_crs(rental_sf)Coordinate Reference System:
User input: EPSG:3414
wkt:
PROJCRS["SVY21 / Singapore TM",
BASEGEOGCRS["SVY21",
DATUM["SVY21",
ELLIPSOID["WGS 84",6378137,298.257223563,
LENGTHUNIT["metre",1]]],
PRIMEM["Greenwich",0,
ANGLEUNIT["degree",0.0174532925199433]],
ID["EPSG",4757]],
CONVERSION["Singapore Transverse Mercator",
METHOD["Transverse Mercator",
ID["EPSG",9807]],
PARAMETER["Latitude of natural origin",1.36666666666667,
ANGLEUNIT["degree",0.0174532925199433],
ID["EPSG",8801]],
PARAMETER["Longitude of natural origin",103.833333333333,
ANGLEUNIT["degree",0.0174532925199433],
ID["EPSG",8802]],
PARAMETER["Scale factor at natural origin",1,
SCALEUNIT["unity",1],
ID["EPSG",8805]],
PARAMETER["False easting",28001.642,
LENGTHUNIT["metre",1],
ID["EPSG",8806]],
PARAMETER["False northing",38744.572,
LENGTHUNIT["metre",1],
ID["EPSG",8807]]],
CS[Cartesian,2],
AXIS["northing (N)",north,
ORDER[1],
LENGTHUNIT["metre",1]],
AXIS["easting (E)",east,
ORDER[2],
LENGTHUNIT["metre",1]],
USAGE[
SCOPE["Cadastre, engineering survey, topographic mapping."],
AREA["Singapore - onshore and offshore."],
BBOX[1.13,103.59,1.47,104.07]],
ID["EPSG",3414]]
Simple feature collection with 6 features and 6 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: 15786.61 ymin: 30769.77 xmax: 39668.39 ymax: 45634.94
Projected CRS: SVY21 / Singapore TM
# A tibble: 6 × 7
rent_approval_date town block street_name flat_type monthly_rent
<date> <chr> <chr> <chr> <chr> <dbl>
1 2024-01-01 YISHUN 386 YISHUN RING RD 4-ROOM 3700
2 2024-01-01 JURONG WEST 140B CORPORATION DR 4-ROOM 3900
3 2024-01-01 SENGKANG 471B FERNVALE ST 5-ROOM 3700
4 2024-01-01 KALLANG/WHAMPOA 10 GLOUCESTER RD 3-ROOM 3600
5 2024-01-01 BEDOK 31 BEDOK STH AVE… 5-ROOM 4350
6 2024-01-01 QUEENSTOWN 82 STRATHMORE AVE 4-ROOM 3000
# ℹ 1 more variable: geometry <POINT [m]>
2.2.1.5 Checking for NA values
This chunk of code checks the dataset for any na values in all of the columns. As shown below, there is none.
rental_sf %>%
summarise(across(everything(), ~ sum(is.na(.)))) -> extra_NA
extra_NASimple feature collection with 1 feature and 6 fields
Geometry type: MULTIPOINT
Dimension: XY
Bounding box: xmin: 11519.15 ymin: 28097.64 xmax: 45192.3 ymax: 48741.06
Projected CRS: SVY21 / Singapore TM
# A tibble: 1 × 7
rent_approval_date town block street_name flat_type monthly_rent
<int> <int> <int> <int> <int> <int>
1 0 0 0 0 0 0
# ℹ 1 more variable: geometry <MULTIPOINT [m]>
3. Data Wrangling
3.1 Removal of Redundant Columns
To increase efficiency and reduce the data size, we can remove columns we do not need like the block and street_name in which we have already utilised previously and now have no use for.
# Define columns to be removed
columns_to_remove <- c("block","street_name")
# Remove columns only if they exist in the dataframe
rental_sf <- rental_sf %>%
dplyr::select(-all_of(columns_to_remove[columns_to_remove %in% names(rental_sf)]))3.2 Filter By Flat Type
Let us get an overview of the distributions of the housing types. As shown in the histogram, we can see that there is significantly less data for flat types like 1-room, 2-room, and executive housing.
# Create a summary of counts for each remaining lease range
count_data <- rental_sf %>%
group_by(flat_type) %>%
summarise(count = n())
# Create the bar plot with labels
ggplot(count_data, aes(x = flat_type, y = count)) +
geom_bar(stat = "identity", fill = "steelblue") +
geom_text(aes(label = count), vjust = -0.5, size = 4) + # Add labels on top of the bars
labs(title = "Count of Flat Type",
x = "Flat Type",
y = "Count") +
theme_minimal()Hence, we will focus on analyzing the 3-room, 4-room, and 5-room flats since they show a more substantial presence in the dataset compared to smaller flat types.
rental_sf <- rental_sf %>% filter (flat_type == '3-ROOM' | flat_type == '4-ROOM' |flat_type == '5-ROOM' )3.3 Adding Region to Rental Data
This chunk of code performs a left join with mpsz_sf to categorise the different flats into different regions in order to better understand the rental trends.
3.3.1 Left Joining with mpsz_sf
# Perform the left join by dropping the geometry from 'datab' and only bringing in 'region_n'
rental_sf <- rental_sf %>%
left_join(st_drop_geometry(mpsz_sf) %>% select(PLN_AREA_N, REGION_N) %>% distinct(PLN_AREA_N, .keep_all = TRUE),
by = c("town" = "PLN_AREA_N"))3.3.2 Identifying Rows with NA values
Then, let’s perform a check to see if any of the rows have na values in the newly created column and display it. As shown here we can see that there are multiple rows in which the town column was unable to find a matching value in the mpsz_sf PLN_AREA_N column.
rental_sf_with_na <- rental_sf %>%
filter(is.na(REGION_N))
rental_sf_with_naSimple feature collection with 1471 features and 5 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: 19536.43 ymin: 28634.73 xmax: 33665.81 ymax: 41493.47
Projected CRS: SVY21 / Singapore TM
# A tibble: 1,471 × 6
rent_approval_date town flat_type monthly_rent geometry
* <date> <chr> <chr> <dbl> <POINT [m]>
1 2024-01-01 KALLANG/… 3-ROOM 3600 (30102.41 32911.75)
2 2024-01-01 KALLANG/… 3-ROOM 2300 (19536.43 41493.47)
3 2024-01-01 CENTRAL 3-ROOM 3000 (29435.92 29669.46)
4 2024-01-01 KALLANG/… 3-ROOM 1850 (19536.43 41493.47)
5 2024-01-01 KALLANG/… 4-ROOM 3400 (31184.91 34078.85)
6 2024-01-01 KALLANG/… 5-ROOM 4100 (31013.62 33175.3)
7 2024-01-01 KALLANG/… 3-ROOM 3200 (31618.57 33708.83)
8 2024-01-01 KALLANG/… 3-ROOM 2400 (31228.06 33432.22)
9 2024-01-01 CENTRAL 3-ROOM 2850 (29067.24 29360.52)
10 2024-01-01 KALLANG/… 3-ROOM 2700 (31547.37 31835.93)
# ℹ 1,461 more rows
# ℹ 1 more variable: REGION_N <chr>
Using unique, we can identify the town values of these problematic rows and also the available regions in mpsz_sf so that we have a brief idea of what are the possible values we can later use. In particular, the problematic values are ‘Kallang/Whampoa’ and ‘Central’.
unique (rental_sf_with_na$town)[1] "KALLANG/WHAMPOA" "CENTRAL"
unique(mpsz_sf$REGION_N)[1] "CENTRAL REGION" "WEST REGION" "EAST REGION"
[4] "NORTH-EAST REGION" "NORTH REGION"
Since the value is Kallang/Whampoa, let’s try to find the region of either Kallang or Whampoa through the filter() function.
test <- mpsz_sf%>% filter(PLN_AREA_N == 'KALLANG' | PLN_AREA_N == 'WHAMPOA') %>% select(PLN_AREA_N,REGION_N)
testSimple feature collection with 9 features and 2 fields
Geometry type: POLYGON
Dimension: XY
Bounding box: xmin: 29224.85 ymin: 30694.74 xmax: 33809.38 ymax: 34738.73
Projected CRS: SVY21
PLN_AREA_N REGION_N geometry
1 KALLANG CENTRAL REGION POLYGON ((31632.98 30741.73...
2 KALLANG CENTRAL REGION POLYGON ((31915.99 31851.24...
3 KALLANG CENTRAL REGION POLYGON ((31494.25 32088.46...
4 KALLANG CENTRAL REGION POLYGON ((32710.73 33608.07...
5 KALLANG CENTRAL REGION POLYGON ((31277.37 34723.29...
6 KALLANG CENTRAL REGION POLYGON ((31389.56 32098.17...
7 KALLANG CENTRAL REGION POLYGON ((30344.12 32879.39...
8 KALLANG CENTRAL REGION POLYGON ((32160.87 32549.12...
9 KALLANG CENTRAL REGION POLYGON ((31437.02 32345.27...
While we can’t find a match for Whampoa, we can see that Kallang falls under the Central Region. From the naming, we can also make the deduction that the town ‘Central’ likely falls under the same region. Thus by using a if_else statement we can assign the region Central Region to these towns.
rental_sf <- rental_sf %>%
mutate(REGION_N = if_else(town == 'CENTRAL' | town == 'KALLANG/WHAMPOA', 'CENTRAL REGION', REGION_N))Let us also rename the column to standardise the namings.
rental_sf <- rental_sf %>% rename(region = REGION_N)3.4 Calculate Number of Facilities Within A Certain Distance & Proximity To Nearest Facility
Since the number of facilities within range and proximity to certain facilities are some of the most important factors of rental prices, it is important for us to include that in our analysis as well. Thus to do so we have the below function to made these calculations based on the locations of the different facilities’ datasets we have imported compared with the individual rental flats themselves.
Note: the calculateNumberOfFacilities argument is a parameter used to indicate if the calculation of the facility count within the radius is required for a particular facility.
Click to view the code
# For every feature in dataset1, optionally count how many features of
# dataset2 lie within `radius` (written to name_of_col_facilities) and
# always record the distance to the nearest dataset2 feature (written to
# name_of_col_proximity). Returns dataset1 with the new column(s) added.
calculate_facilities_and_proximity <- function(dataset1, dataset2, name_of_col_facilities, name_of_col_proximity, radius, calculateNumberOfFacilities) {
  # Full pairwise distance matrix between the two layers, with the units
  # attribute stripped so plain numeric comparisons work.
  distances <- drop_units(st_distance(dataset1, dataset2))
  # Count facilities within the radius only when the caller asked for it.
  if (calculateNumberOfFacilities) {
    dataset1[[name_of_col_facilities]] <- rowSums(distances <= radius)
  }
  # Distance from each feature to its single nearest facility.
  dataset1[[name_of_col_proximity]] <- rowMins(distances)
  dataset1
}
The below chunk of code calls upon the calculate_facilities_and_proximity() based on the different parameters stated for each facility. We indicated for the mrt and primary school to not be included in the calculations for the count within a certain radius as the distance to such facilities has way more importance than the actual count of it which is usually one within a certain range since these facilities are more spread out.
Click to view the code
rental_sf <-
calculate_facilities_and_proximity(
rental_sf, kindergarten_sf, "no_of_kindergarten_500m", "prox_kindergarten", 500, TRUE
) %>%
calculate_facilities_and_proximity(
., childcare_sf, "no_of_childcare_500m", "prox_childcare", 500, TRUE
) %>%
calculate_facilities_and_proximity(
., hawker_sf, "no_of_hawker_500m", "prox_hawker", 500, TRUE
) %>%
calculate_facilities_and_proximity(
., busstop_sf, "no_of_busstop_500m", "prox_busstop", 500, TRUE
) %>%
calculate_facilities_and_proximity(
., shoppingmall_sf, "no_of_shoppingmall_1km", "prox_shoppingmall", 1000, TRUE
) %>%
calculate_facilities_and_proximity(
., mrt_sf, "x", "prox_mrt", 1000, FALSE
) %>%
calculate_facilities_and_proximity(
., primarysch_sf, "x", "prox_prisch", 1000, FALSE
) %>%
calculate_facilities_and_proximity(
., cbd_sf, "x", "prox_cbd", 1000, FALSE
)
# Writing to RDS
write_rds(rental_sf,'data/rds/rental_sf.rds')Likewise, to skip the whole time-consuming process, we can instead read the rds data using the below code.
rental_sf <- read_rds('data/rds/rental_sf.rds')
glimpse(rental_sf)Rows: 25,713
Columns: 19
$ rent_approval_date <date> 2024-01-01, 2024-01-01, 2024-01-01, 2024-01-0…
$ town <chr> "YISHUN", "JURONG WEST", "SENGKANG", "KALLANG/…
$ flat_type <chr> "4-ROOM", "4-ROOM", "5-ROOM", "3-ROOM", "5-ROO…
$ monthly_rent <dbl> 3700, 3900, 3700, 3600, 4350, 3000, 3800, 3600…
$ geometry <POINT [m]> POINT (29463.95 45634.94), POINT (15786.…
$ region <chr> "NORTH REGION", "WEST REGION", "NORTH-EAST REG…
$ no_of_kindergarten_500m <dbl> 1, 1, 0, 1, 1, 6, 3, 1, 1, 1, 1, 0, 2, 4, 0, 0…
$ prox_kindergarten <dbl> 3.717727e+02, 1.241314e+02, 6.531547e+02, 4.47…
$ no_of_childcare_500m <dbl> 10, 9, 4, 3, 6, 11, 9, 9, 7, 6, 4, 1, 8, 10, 2…
$ prox_childcare <dbl> 6.318731e+01, 7.642944e+01, 8.264710e+01, 4.47…
$ no_of_hawker_500m <dbl> 1, 0, 0, 1, 2, 0, 0, 0, 2, 0, 0, 0, 1, 1, 0, 0…
$ prox_hawker <dbl> 478.4537, 840.4254, 660.6058, 332.1417, 354.77…
$ no_of_busstop_500m <dbl> 17, 17, 6, 11, 13, 17, 15, 8, 16, 11, 5, 6, 17…
$ prox_busstop <dbl> 174.00119, 80.37739, 70.48567, 161.93848, 280.…
$ no_of_shoppingmall_1km <dbl> 1, 1, 1, 2, 0, 3, 1, 2, 1, 2, 1, 0, 3, 4, 0, 1…
$ prox_shoppingmall <dbl> 704.70468, 905.82230, 735.18898, 565.82028, 10…
$ prox_mrt <dbl> 1259.97262, 1869.01210, 197.18773, 175.98753, …
$ prox_prisch <dbl> 271.15943, 1353.83517, 86.23193, 234.73109, 57…
$ prox_cbd <dbl> 15605.314, 14916.316, 12419.101, 2871.311, 103…
4. Overview Of Dataset
Below is an overview of what our dataset currently consists of.
colnames(rental_sf) [1] "rent_approval_date" "town"
[3] "flat_type" "monthly_rent"
[5] "geometry" "region"
[7] "no_of_kindergarten_500m" "prox_kindergarten"
[9] "no_of_childcare_500m" "prox_childcare"
[11] "no_of_hawker_500m" "prox_hawker"
[13] "no_of_busstop_500m" "prox_busstop"
[15] "no_of_shoppingmall_1km" "prox_shoppingmall"
[17] "prox_mrt" "prox_prisch"
[19] "prox_cbd"
Dependent Variables:
- Monthly Rental:
monthly_rent
Explanatory Variables:
Continuous
Prox_ [distance to closest]: kindergarten, childcare, hawker, bus stops, shopping mall, mrt, primary schools, cbd
Count of xx within xx distance:
no_of_kindergarten_500m,no_of_childcare_500m,no_of_hawker_500m,no_of_busstop_500m,no_of_shoppingmall_1km
Categorical
Flat Type:
flat_typeTown:
townRegion:
region
5. Shiny Storyboard (EDA)
For my part I am in charge of the exploratory data analysis. Thus, I will be covering the following visualisations:
5.1 Histogram
5.2 Choropleth Map for Rental Prices
4. Histograms
4.1 Categorical Variables
4.1.1 Housing Type
# Create a summary of counts for each remaining lease range
count_data <- rental_sf %>%
group_by(flat_type) %>%
summarise(count = n())
# Create the bar plot with labels
ggplot(count_data, aes(x = flat_type, y = count)) +
geom_bar(stat = "identity", fill = "steelblue") +
geom_text(aes(label = count), vjust = -0.5, size = 4) + # Add labels on top of the bars
labs(title = "Count of Flat Type",
x = "Flat Type",
y = "Count") +
theme_minimal()Choropleth Mapping
# Dealing with Kallang/Whampoa & Central town which is not present in mpsz
# Filter based on month and flat type
ch_filtered_rental_sf <- rental_sf %>%
mutate(town = if_else(town == 'KALLANG/WHAMPOA', 'KALLANG', town)) %>%
filter(town != "CENTRAL") %>% filter (month(rent_approval_date) ==1 & flat_type == '5-ROOM')
# Create chloropleth_data
chloropleth_data <- ch_filtered_rental_sf %>%
group_by(town) %>%
summarize(event_count = n(), .groups = 'drop') %>%
st_drop_geometry() %>%
right_join(combined_town_mpsz_sf,
by = c("town" = "PLN_AREA_N")) %>%
st_as_sf() # Convert to sf object after joiningtmap_mode("view")
qtm(chloropleth_data,
fill = "event_count")(n_distinct(rental_sf$town))[1] 26
Drafts
mpsz_sf_main <- st_union(mpsz_sf) %>%
st_cast("POLYGON")
mpsz_sf_main <- mpsz_sf_main[c(12)]
mpsz_sf_owin <- as.owin(mpsz_sf_main)plot(mpsz_sf_owin)tmap_mode('plot')
tm_shape(mpsz_sf%>% filter(PLN_AREA_N == 'ANG MO KIO'))+
tm_polygons()+
tm_shape(rental_sf %>% filter(planning_area_ura == 'ANG MO KIO'))+
tm_dots()